Read in data

cellsdata = Read10X("/Users/tfriedrich/Downloads/FC_102418_with_markers/outs/filtered_feature_bc_matrix/")
cells = CreateSeuratObject(counts = cellsdata, project = "FC", min.cells = 3, min.features = 200)

Number of each cell showing expression for each marker.

length(which(cellsdata["EGFP.dna",]>0))
## [1] 446
length(which(cellsdata["lacZ.dna",]>0))
## [1] 9
length(which(cellsdata["mCerulean.dna",]>0))
## [1] 59
length(which(cellsdata["mCherry.dna",]>0))
## [1] 7
length(which(cellsdata["mOrange.dna",]>0))
## [1] 14
length(which(cellsdata["tdrfp.dna",]>0))
## [1] 496

Write cells expressing markers of interest to a file.

A = subset(subset(cells, lacZ.dna>0) ,Xist>0)
B = subset(subset(cells, lacZ.dna==0) ,Xist>0)
C = subset(subset(cells, tdrfp.dna>0) ,Xist=0)
tdrfp = subset(cells, tdrfp.dna>0)
lacZ = subset(cells, lacZ.dna>0)
mOrange = subset(cells, mOrange.dna>0)
mCerulean = subset(cells, mCerulean.dna>0)
mCherry = subset(cells, mCherry.dna>0)

write.csv(x = GetAssayData(A), file = "lacZ_positive_and_Xist_positive.csv", quote=FALSE)
write.csv(x = GetAssayData(B), file = "lacZ_negative_and_Xist_positive.csv", quote=FALSE)
write.csv(x = GetAssayData(C), file = "tdrfp_positve_and_Xist_negative.csv", quote=FALSE)

write.csv(x = GetAssayData(tdrfp), file = "tdrfp_positive.csv", quote=FALSE)
write.csv(x = GetAssayData(lacZ), file = "lacZ_positive.csv", quote=FALSE)
write.csv(x = GetAssayData(mOrange), file = "mOrange_positve.csv", quote=FALSE)
write.csv(x = GetAssayData(mCerulean), file = "mCerulean_positve.csv", quote=FALSE)
write.csv(x = GetAssayData(mCherry), file = "mCherry_positve.csv", quote=FALSE)

Gather scRNA-seq QC metric.

mito.genes <- grep(pattern = "^mt-", x = rownames(x = cells), value = TRUE)
percent.mito <- Matrix::colSums(cells[mito.genes, ])/Matrix::colSums(cells)

AddMetaData adds columns to object@meta.data, and is a great place to stash QC stats

cells <- AddMetaData(object = cells, metadata = percent.mito, col.name = "percent.mito")

Plot expressino of marker genes

VlnPlot(object = cells, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"))

marker_genes = c("Hnf4a", "Alb", "Fah", "Krt8", "Krt18", "G6pc", "Sox9", "Krt7", "Krt19", "Epcam", "Onecut1", "Afp", "Xist")
VlnPlot(A,c("Hnf4a", "Alb", "Fah"))
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE],
## idents = idents, : All cells have the same value of Alb.
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE],
## idents = idents, : All cells have the same value of Fah.

VlnPlot(B,c("Hnf4a", "Alb", "Fah"))

VlnPlot(C,c("Hnf4a", "Alb", "Fah"))

cells <- NormalizeData(object = cells, normalization.method = "LogNormalize", 
    scale.factor = 10000)

vst: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then standardizes the feature values using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).

cells <- FindVariableFeatures(object = cells, selection.method = "vst", nfeatures = 2000)

Regress out expression effects confounded by nUMI and percent.mito

cells <- ScaleData(object = cells, vars.to.regress = c("nUMI", "percent.mito"))
## Regressing out nUMI, percent.mito
## Centering and scaling data matrix

Regress out cell cycle genes. For each gene, Seurat models the relationship between gene expression and the S and G2M cell cycle scores. The scaled residuals of this model represent a ‘corrected’ expression matrix, that can be used downstream for dimensional reduction.

s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

cells <- CellCycleScoring(object=cells, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
## Warning in AddModuleScore(object = object, features = features, name =
## name, : Could not find enough features in the object from the following
## feature lists: S.Score Attempting to match case...Could not find enough
## features in the object from the following feature lists: G2M.Score
## Attempting to match case...
cells <- RunPCA(object = cells, pc.genes = cells@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5)
## PC_ 1 
## Positive:  Igfbp7, Spp1, Cp, Xist, Trf, Lcn2, Gc, C3, Ifitm3, Tmem176b 
##     Mgst1, Stmn1, Hmgn2, Tesc, Tceal9, Hpn, Selenoh, Cdk4, Tmsb10, Ptma 
##     Hnrnpa1, Slc25a4, Tmem176a, Npm1, Hmgn1, B2m, Nme1, Cald1, Snhg18, Cxcl5 
## Negative:  Lgals4, Plac8, Phgr1, Ctse, Gsta1, Gcnt3, Gm3336, Tff3, Psca, Spink4 
##     Lgals3, Cyp2s1, Msln, Txnip, Ly6a, AA467197, Duox2, Gsto1, 2210407C18Rik, Duoxa2 
##     Crip1, Cyp3a13, Fkbp5, Krt20, Cbr1, Gm10260, Ahnak, S100a14, Akr1b7, Rbp2 
## PC_ 2 
## Positive:  Wfdc2, Birc5, Hmgb2, Cdca8, Ccnb2, Pclaf, Ccna2, Cdca3, Cdk1, Top2a 
##     Cenpa, Nusap1, Ube2c, Gpx2, Pbk, Cenpf, Racgap1, Esco2, Aurkb, Ccnb1 
##     Spc24, Hist1h2ap, Cdc20, Car2, Pimreg, Prc1, Smc2, Cks2, Sftpd, Knstrn 
## Negative:  Ccdc80, Aebp1, Col1a2, Sparc, Vim, Axl, Serping1, Lhfp, Col3a1, Bgn 
##     Ptgis, Nnmt, Rcn3, Loxl1, Col1a1, Lpl, Emilin1, Mxra8, Col5a2, Dcn 
##     Lgals1, Ak1, Gpx8, Fstl1, Col6a1, Colec12, Lrp1, Lbh, Gm12840, Cygb 
## PC_ 3 
## Positive:  Alb, Fga, Cfi, Agt, Itih2, Ttc36, Lrg1, Tns1, St3gal5, Cp 
##     C3, Hpn, Fgg, Azgp1, Nrn1, Pah, Osgin1, Fgb, Lcn2, Ambp 
##     Spp1, Gstm1, Dcdc2a, Ces1d, Trf, Gas6, Slc5a1, Scn1b, Mgst1, Tmem176b 
## Negative:  Birc5, Pclaf, Pbk, Ccna2, Cdca8, Cdca3, Cdk1, Ube2c, Aurkb, Top2a 
##     Hist1h2ap, Nusap1, Spc24, Ccnb1, Mki67, Racgap1, Cenpa, Cdc20, Smc2, Ccnb2 
##     Tk1, Cenpf, Pimreg, Prc1, Esco2, Spc25, Cenpm, Plk1, Hmmr, Cdkn3 
## PC_ 4 
## Positive:  Wfdc17, Lyz2, Fcer1g, Ccl6, Tyrobp, Ctss, Cd68, Apoe, Fcgr3, Laptm5 
##     Ms4a6d, Trem2, Cd53, C1qb, C1qa, Mpeg1, C1qc, Plek, Pla2g7, Gpnmb 
##     Selenop, Fcgr2b, Pirb, Cybb, C5ar1, Lypd8, Lilr4b, Clec4d, Pmp22, Rac2 
## Negative:  Ccnd1, Tnfrsf12a, Basp1, Krt7, Wfdc2, Plaur, Tubb2b, Lor, Sprr1a, Phgdh 
##     Cd44, Tpd52l1, Krt23, Hspb1, Serpinb6b, Plat, Cdh13, Cxcl16, Ctgf, Tacstd2 
##     Anxa1, Cryab, Edn1, Gprc5a, Cldn4, Cdk6, Pbx1, Msln, Phlda1, Nupr1 
## PC_ 5 
## Positive:  Lyz2, Fcer1g, Wfdc17, Tyrobp, Ctss, Ccl6, Fcgr3, Laptm5, Ms4a6d, Trem2 
##     Plek, Pla2g7, Cd53, Adam8, Gpnmb, C1qb, Pirb, C1qc, Mpeg1, Fcgr2b 
##     C5ar1, C1qa, Lilr4b, Clec4d, Cd84, Rac2, Cybb, Pmp22, Rgs1, Csf1r 
## Negative:  Ttr, Tff3, Lypd8, Ambp, AA467197, 2210407C18Rik, Akr1b7, Lrg1, Spink4, Fga 
##     Dmbt1, Upp1, Alb, Sptssb, Pigr, Fcgbp, Clca1, Sult1d1, Pah, Cyp2c65 
##     Gm3336, Sult1c2, Cfi, Lgals4, Serping1, Fgg, AC102496.1, Ret, Snhg18, Txnip
ElbowPlot(cells)

Highly variable genes.

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(cells), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(cells)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot2

Visualize using UMAP. Colored by cell state.

cells <- RunUMAP(object = cells, dims = 1:10)
#pdf("FeaturePlots_FC.pdf")
DimPlot(object = cells, reduction = "umap")

FeaturePlot(cells, features = marker_genes[1:4])

FeaturePlot(cells, features = marker_genes[5:8])

FeaturePlot(cells, features = marker_genes[9:12])

FeaturePlot(cells, features = marker_genes[13])

FeaturePlot(cells, features = c("EGFP.dna","lacZ.dna","mCerulean.dna","mCherry.dna"))

FeaturePlot(cells, features = c("mCherry.dna","mOrange.dna","tdrfp.dna"))

#dev.off()
#Extract data, phenotype data, and feature data from the SeuratObject
#https://github.com/cole-trapnell-lab/monocle-release/issues/262
data <- as(as.matrix(cells@assays$RNA@data), 'sparseMatrix')
pd <- new('AnnotatedDataFrame', data = cells@meta.data)
fData <- data.frame(gene_short_name = row.names(data), row.names = row.names(data))
fd <- new('AnnotatedDataFrame', data = fData)
monocle_cds <- newCellDataSet(data,
                         phenoData = pd,
                         featureData = fd,
                         lowerDetectionLimit = 0.5,
                         expressionFamily = negbinomial.size())

# generate size factors for normalization later
monocle_cds <- estimateSizeFactors(monocle_cds)

# markers for common cell types
marker_file_path <-  "/Users/tfriedrich/Downloads/markers_garnett.txt"
marker_check <- check_markers(monocle_cds, marker_file_path,
                              db=org.Mm.eg.db,
                              cds_gene_id_type = "SYMBOL",
                              marker_file_gene_id_type = "SYMBOL")
## There are 3 cell type definitions
plot_markers(marker_check)
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_text_repel).

# train cells using marker genes as labels Garnett classification is trained using a multinomial elastic-net regression. This means that certain genes are chosen as the relevant genes for distinguishing between cell types. Which genes are chosen may be of interest, so Garnett includes a function to access the chosen genes. Note: Garnett does not regularize the input markers, so they will be included in the classifier regardless.

set.seed(260)
pbmc_classifier <- train_cell_classifier(cds = monocle_cds,
                                         marker_file = marker_file_path,
                                         db=org.Mm.eg.db,
                                         cds_gene_id_type = "SYMBOL",
                                         num_unknown = 50,
                                         marker_file_gene_id_type = "SYMBOL")
## There are 3 cell type definitions
## Warning in make_name_map(parse_list,
## as.character(row.names(fData(norm_cds))), : 5 genes could not be converted
## from SYMBOL to ENSEMBL These genes are listed below:
## Warning in make_name_map(parse_list, as.character(row.names(fData(norm_cds))), : The following genes from the cell type definition file are notpresent in the cell dataset.  Please check these genes for errors.Cell type determination will continue, ignoring these genes.
## Opn
## Cypa1
## Cyp3a4
## Aat
## Cyp3a7
## training_sample
## cholangiocyte   hepatoblast    hepatocyte       Unknown 
##           500             8           500            50
## Model training finished.
head(pData(monocle_cds))
##                  orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCTGAGACTGGGT         FC      12250         3112  0.009061224
## AAACCTGAGCACACAG         FC      47677         6523  0.012165195
## AAACCTGAGTTAACGA         FC      14352         3506  0.005643813
## AAACCTGCAAGGTGTG         FC      27491         5257  0.013822706
## AAACCTGCATGATCCA         FC      18650         3994  0.006702413
## AAACCTGCATGCCACG         FC      53461         6647  0.009726717
##                        S.Score   G2M.Score Phase old.ident Size_Factor
## AAACCTGAGACTGGGT  9.873166e-05 -0.05839754     S        FC   1.2089628
## AAACCTGAGCACACAG -3.734501e-02 -0.04636801    G1        FC   0.9440832
## AAACCTGAGTTAACGA -2.359718e-02 -0.07000838    G1        FC   1.3806441
## AAACCTGCAAGGTGTG -2.649230e-02 -0.07620632    G1        FC   1.1419254
## AAACCTGCATGATCCA -3.156153e-02 -0.02979146    G1        FC   0.8796619
## AAACCTGCATGCCACG  2.837445e-01  0.51890682   G2M        FC   0.9898649
feature_genes <- get_feature_genes(pbmc_classifier,
                                   node = "root",
                                   db = org.Mm.eg.db, 
                                   convert_ids = TRUE)
head(feature_genes)
##             cholangiocyte   hepatoblast   hepatocyte       Unknown
## (Intercept)    1.77910158 -4.430007e+00  1.180655465  1.4702498367
## Imp4          -0.02799056 -6.411412e-05 -0.002402989  0.0304576677
## Plekhb2        0.02290446 -1.202628e-03 -0.021853171  0.0001513424
## Txndc9         0.02469316  3.076515e-03  0.067396988 -0.0951666603
## Pdcl3         -0.03381068  1.768482e-03 -0.002048701  0.0340908992
## Cavin2         0.01853130 -3.062922e-04  0.018786126 -0.0370111352
write.table(feature_genes, "/Users/tfriedrich/Downloads/garnett_elastic_regression_feature_genes.xls", sep="\t", col.names=NA)

DE <- FindMarkers(cells, ident.1 = "G1", ident.2 = "S")